In [6]:
library(tidyverse)
library(car)        # for VIF
library(glmnet)     # for LASSO
library(caret)      # for preprocessing
library(readr)
library(dplyr)
library(broom)
library(ggplot2)
library(pROC)
library(stringr)
library(patchwork)
In [7]:
# Location of the raw CSV file.
data_url <- "https://raw.githubusercontent.com/audracornick/STAT301_Group24/main/online_shoppers_intention.csv"

# Read the CSV straight from the URL; column types are left for readr to guess.
OSPI <- read_csv(file = data_url)

# Preview the first six rows.
head(OSPI, n = 6L)
Rows: 12330 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): Month, VisitorType
dbl (14): Administrative, Administrative_Duration, Informational, Informatio...
lgl  (2): Weekend, Revenue

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 18
AdministrativeAdministrative_DurationInformationalInformational_DurationProductRelatedProductRelated_DurationBounceRatesExitRatesPageValuesSpecialDayMonthOperatingSystemsBrowserRegionTrafficTypeVisitorTypeWeekendRevenue
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr><dbl><dbl><dbl><dbl><chr><lgl><lgl>
0000 1 0.0000000.200000000.200000000Feb1111Returning_VisitorFALSEFALSE
0000 2 64.0000000.000000000.100000000Feb2212Returning_VisitorFALSEFALSE
0000 1 0.0000000.200000000.200000000Feb4193Returning_VisitorFALSEFALSE
0000 2 2.6666670.050000000.140000000Feb3224Returning_VisitorFALSEFALSE
000010627.5000000.020000000.050000000Feb3314Returning_Visitor TRUEFALSE
000019154.2166670.015789470.024561400Feb2213Returning_VisitorFALSEFALSE
In [8]:
# Build the analysis table: subset the rows, then turn every categorical
# column (labels, logical flags and integer-coded categories) into a factor.
# NOTE(review): filter(Region != 1) discards every Region 1 session before the
# conversion — confirm this subsetting is intentional.
OSPI_clean <- OSPI %>%
  filter(Region != 1) %>%
  mutate(
    across(
      c(
        Revenue, Month, VisitorType, Weekend, SpecialDay,
        Browser, TrafficType, OperatingSystems, Region
      ),
      as.factor
    )
  )

# Quick check
summary(OSPI_clean)
 Administrative   Administrative_Duration Informational    
 Min.   : 0.000   Min.   :   0.00         Min.   : 0.0000  
 1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.0000  
 Median : 1.000   Median :  10.00         Median : 0.0000  
 Mean   : 2.343   Mean   :  81.56         Mean   : 0.4771  
 3rd Qu.: 4.000   3rd Qu.:  97.46         3rd Qu.: 0.0000  
 Max.   :27.000   Max.   :2156.17         Max.   :14.0000  
                                                           
 Informational_Duration ProductRelated   ProductRelated_Duration
 Min.   :   0.00        Min.   :  0.00   Min.   :    0.0        
 1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  189.8        
 Median :   0.00        Median : 18.00   Median :  603.1        
 Mean   :  32.69        Mean   : 30.42   Mean   : 1143.2        
 3rd Qu.:   0.00        3rd Qu.: 37.00   3rd Qu.: 1448.9        
 Max.   :2252.03        Max.   :584.00   Max.   :24844.2        
                                                                
  BounceRates         ExitRates         PageValues      SpecialDay
 Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   0  :6793  
 1st Qu.:0.000000   1st Qu.:0.01405   1st Qu.:  0.000   0.2: 111  
 Median :0.002995   Median :0.02576   Median :  0.000   0.4: 147  
 Mean   :0.022256   Mean   :0.04304   Mean   :  5.881   0.6: 199  
 3rd Qu.:0.017204   3rd Qu.:0.05000   3rd Qu.:  0.000   0.8: 197  
 Max.   :0.200000   Max.   :0.20000   Max.   :361.764   1  : 103  
                                                                  
     Month      OperatingSystems    Browser         Region      TrafficType  
 May    :2137   2      :4091     2      :4906   3      :2403   2      :2370  
 Nov    :1710   3      :1594     1      :1417   4      :1182   1      :1576  
 Mar    :1129   1      :1519     4      : 458   2      :1136   3      :1171  
 Dec    :1101   4      : 252     5      : 292   6      : 805   4      : 677  
 Oct    : 334   8      :  69     10     : 110   7      : 761   13     : 466  
 Sep    : 292   6      :  17     6      :  99   9      : 511   6      : 296  
 (Other): 847   (Other):   8     (Other): 268   (Other): 752   (Other): 994  
            VisitorType    Weekend      Revenue    
 New_Visitor      :1037   FALSE:5759   FALSE:6413  
 Other            :  77   TRUE :1791   TRUE :1137  
 Returning_Visitor:6436                            
                                                   
                                                   
                                                   
                                                   
In [9]:
# Short working name for the cleaned data; the plotting code reads `df`.
df <- OSPI_clean
# Preview the first six rows.
head(df, n = 6L)
A tibble: 6 × 18
AdministrativeAdministrative_DurationInformationalInformational_DurationProductRelatedProductRelated_DurationBounceRatesExitRatesPageValuesSpecialDayMonthOperatingSystemsBrowserRegionTrafficTypeVisitorTypeWeekendRevenue
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><fct><fct><fct><fct><fct><fct><fct><fct><fct>
0000 1 0.0000000.200000.2000000000 Feb4193Returning_VisitorFALSEFALSE
0000 2 2.6666670.050000.1400000000 Feb3224Returning_VisitorFALSEFALSE
0000 1 0.0000000.200000.2000000000.4Feb2433Returning_VisitorFALSEFALSE
0000 2 37.0000000.000000.1000000000.8Feb2223Returning_VisitorFALSEFALSE
0000 3395.0000000.000000.0666666700 Feb1133Returning_VisitorFALSEFALSE
000016407.7500000.018750.0258333300.4Feb1143Returning_VisitorFALSEFALSE
In [10]:
# Plot 1: Revenue by Weekend & Visitor Type
# Bars are scaled to proportions (position = "fill"), with one facet per
# visitor type.
# The lookup keys must match the VisitorType factor levels exactly
# ("New_Visitor", "Other", "Returning_Visitor"). A key that matches no level is
# never used, and depending on the ggplot2 version the unmatched level is shown
# as NA or left unrelabelled.
p1 <- ggplot(df, aes(x = Weekend, fill = Revenue)) +
  geom_bar(position = "fill") +
  facet_wrap(
    ~VisitorType,
    labeller = labeller(VisitorType = c(
      "New_Visitor" = "New",
      "Other" = "Other",
      "Returning_Visitor" = "Return"
    ))
  ) +
  labs(y = "Proportion", title = "Revenue by Weekend & Visitor Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))

# Plot 2: Revenue by Browser and Region (summary stats)
# revenue_rate is the share of rows with Revenue == TRUE in each
# Browser x Region group. Revenue is compared against its "TRUE" level rather
# than doing arithmetic on the factor's integer codes, which is only correct
# while the levels are exactly FALSE, TRUE in that order.
df_summary2 <- df %>%
  group_by(Browser, Region) %>%
  summarise(revenue_rate = mean(Revenue == "TRUE"), .groups = "drop")

# Dodged bars: one bar per Region within each Browser. reorder() sorts the
# browsers along the x-axis by revenue_rate.
p2 <- ggplot(
  df_summary2,
  aes(x = reorder(Browser, revenue_rate), y = revenue_rate, fill = Region)
) +
  geom_col(position = "dodge") +
  labs(y = "Revenue Rate", title = "Revenue Rate by Browser & Region", x = "Browser") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # smaller x-axis labels
    axis.text.y = element_text(size = 8), # smaller y-axis labels
    axis.title = element_text(size = 10), # smaller axis titles
    plot.title = element_text(size = 12) # smaller plot title
  )

# Plot 3: PageValues vs ProductRelated color by Revenue
# Semi-transparent scatter of PageValues against ProductRelated, coloured by
# Revenue, with a linear-model smoother drawn without its confidence band.
p3 <- ggplot(
  data = df,
  mapping = aes(x = ProductRelated, y = PageValues, colour = Revenue)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "PageValues vs ProductRelated by Revenue") +
  theme_minimal()

# Plot 4: Revenue vs Traffic Type and Weekend (summary stats)
# revenue_rate is the share of rows with Revenue == TRUE in each
# TrafficType x Weekend group. Revenue is compared against its "TRUE" level
# rather than doing arithmetic on the factor's integer codes, which is only
# correct while the levels are exactly FALSE, TRUE in that order.
df_summary4 <- df %>%
  group_by(TrafficType, Weekend) %>%
  summarise(revenue_rate = mean(Revenue == "TRUE"), .groups = "drop")

# Dodged bars: weekday and weekend rates side by side for each traffic type.
p4 <- ggplot(df_summary4, aes(x = TrafficType, y = revenue_rate, fill = Weekend)) +
  geom_col(position = "dodge") +
  labs(y = "Revenue Rate", x = "Traffic Type", title = "Revenue Rate by Traffic Type & Weekend") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 12)
  )
In [11]:
# Combine all plots
# Layout: p1 beside p2 on the top row, p3 beside p4 on the bottom row.
# `&` binds more loosely than `/` and `+`, so the theme() call is combined with
# the whole annotated composition; the parentheses only make that explicit.
multiplot <- (
  ((p1 | p2) / (p3 | p4)) +
    plot_annotation(
      title = "Exploratory Analysis of Revenue in Shop Data",
      caption = "Figure A: Revenue by Weekend & Visitor Type\nFigure B: Revenue Rate by Browser & Region\nFigure C: PageValues vs ProductRelated by Revenue\nFigure D: Revenue Rate by Traffic Type & Weekend",
      tag_levels = "A" # Automatically labels subplots A, B, C, D
    )
) &
  theme(
    plot.title = element_text(size = 11, face = "bold"),
    plot.caption = element_text(size = 8),
    plot.tag = element_text(size = 10, face = "bold")
  )

# Render the combined figure.
multiplot
`geom_smooth()` using formula = 'y ~ x'
No description has been provided for this image
In [ ]: